# Text clean-up + n-gram LM training pipeline.
# Reads ${filename}.list from the current directory and writes every
# intermediate stage to a ${filename}-*.txt file next to it, then trains an
# ARPA language model from the final word-segmented text.
#
# Dataset names used in earlier runs (kept for reference):
# filename=ManEng-Base-45G
# filename=Add_1GSJT_2GGen2
# filename=test_6_16_final_text
filename=all_data
echo "${filename}.list"
# NOTE(review): input_path is not referenced anywhere else in this script;
# kept in case a sourced/helper script relies on it - confirm and remove.
input_path=/home/work_nfs8/xlgeng/new_workspace/wenet_gxl_salmonn4ft_LLM/examples/aishell/ft_LLM/data_list/all.text

# Stop early when the input is missing; otherwise every later stage would
# silently run on empty files.
if [ ! -f "${filename}.list" ]; then
  echo "input file not found: ${filename}.list" >&2
  exit 1
fi

# Strip ^M (carriage return) characters.
# The ^M has to be typed in manually as a literal control character; ask the
# ASR team for details.
# sed -i "s/^M//g" ${filename}.txt

# Remove every space character.
echo "开始 去空格"
sed 's/ //g' "${filename}.list" > "${filename}-nospace.txt"
echo "完成 去空格"

# Upper-case all letters.
# The character classes are quoted so the shell cannot glob-expand them
# against single-character file names in the working directory.
echo "开始 字母转大写"
tr '[:lower:]' '[:upper:]' < "${filename}-nospace.txt" > "${filename}-nospace-ABC.txt"
echo "完成 字母转大写"

# Remove special symbols (delegated to the project-local normalize.sh).
echo "开始 去特殊符号"
./normalize.sh "${filename}-nospace-ABC.txt" "${filename}-nospace-ABC-nospecial.txt"
echo "完成 去特殊符号"

# Drop over-long sentences: keep only lines whose length is <= 500.
echo "开始 去长句"
awk '{if(length($0)<=500) print $0}' "${filename}-nospace-ABC-nospecial.txt" > "${filename}-nospace-ABC-nospecial-cutlong.txt"
echo "完成 去长句"

# Word segmentation.
# Input is the length-filtered file produced by the previous step, so the
# long-sentence filter actually takes effect on the training text.
echo "开始 分词"
/home/work_nfs3/fyu/tools/wordseg/word_seg4 \
  /home/work_nfs4_ssd/yhliang/LanguageModel/lang/lang_test616_mix0.1_pruned_5e-11/words.txt \
  "${filename}-nospace-ABC-nospecial-cutlong.txt" \
  "${filename}-nospace-ABC-nospecial-cutlong-wordseg.txt"
echo "完成 分词"

# Train the ARPA language model.
echo "开始 生成arpa"
# Source the local environment setup (presumably puts the LM tools on PATH -
# confirm against path.sh).
. ./path.sh
./kenlm_train.sh "${filename}-nospace-ABC-nospecial-cutlong-wordseg.txt" "./output_${filename}.arpa"
echo "完成 生成arpa"

# Build the output folder.
# The format_lm.sh call is currently disabled, so only the progress messages
# are printed for this stage.
echo "开始 生成output文件夹"
#./format_lm.sh /home/work_nfs4_ssd/yhliang/workspace/online_system_dict/data/lang ./output_${filename}.arpa  /home/work_nfs4_ssd/yhliang/workspace/online_system_dict/lexicon.txt ./output_${filename}
echo "完成 生成output文件夹"

# Perplexity evaluation (example command):
# ngram -ppl test.txt -order 5 -lm train.arpa

